In [50]:
print("Hello")
Hello
Step 1: Load the dataset into Python
In [51]:
import pandas as pd
# Load your dataset
df = pd.read_csv("C://Users//sangr//Downloads//new_credit_risk_dataset.csv")
# Quick look at the data
print(df.head())
print(df.info())
print(df.isnull().sum()) # check missing values
person_age person_income person_home_ownership person_emp_length \ 0 22 59000 RENT 1230.0 1 21 9600 OWN 50.0 2 25 9600 MORTGAGE 10.0 3 23 65500 RENT 40.0 4 24 54400 RENT 80.0 loan_intent loan_grade loan_amnt loan_int_rate loan_status \ 0 PERSONAL D 35000 16.02 1 1 EDUCATION B 1000 11.14 0 2 MEDICAL C 5500 12.87 1 3 MEDICAL C 35000 15.23 1 4 MEDICAL C 35000 14.27 1 loan_percent_income cb_person_default_on_file cb_person_cred_hist_length 0 0.59 Y 3 1 0.10 N 2 2 0.57 N 3 3 0.53 N 2 4 0.55 Y 4 <class 'pandas.core.frame.DataFrame'> RangeIndex: 32581 entries, 0 to 32580 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 person_age 32581 non-null int64 1 person_income 32581 non-null int64 2 person_home_ownership 32581 non-null object 3 person_emp_length 31686 non-null float64 4 loan_intent 32581 non-null object 5 loan_grade 32581 non-null object 6 loan_amnt 32581 non-null int64 7 loan_int_rate 29465 non-null float64 8 loan_status 32581 non-null int64 9 loan_percent_income 32581 non-null float64 10 cb_person_default_on_file 32581 non-null object 11 cb_person_cred_hist_length 32581 non-null int64 dtypes: float64(3), int64(5), object(4) memory usage: 3.0+ MB None person_age 0 person_income 0 person_home_ownership 0 person_emp_length 895 loan_intent 0 loan_grade 0 loan_amnt 0 loan_int_rate 3116 loan_status 0 loan_percent_income 0 cb_person_default_on_file 0 cb_person_cred_hist_length 0 dtype: int64
Define Features & Target
In [52]:
# Define target and features
X = df.drop(columns=["loan_status"]) # features
y = df["loan_status"] # target
Train-Test Split
In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
Evaluate Model
In [54]:
# Import necessary libraries
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Assuming X_train, X_test, y_train, y_test are already defined
# First, identify which columns contain categorical data
# For this example, let's assume we need to find the column index for 'MORTGAGE'
# You should replace this with code that identifies all categorical columns in your dataset
import pandas as pd
# Assuming X_train is a DataFrame
# Find categorical columns (columns with object or category dtype)
categorical_columns = []
for i, dtype in enumerate(X_train.dtypes):
if dtype == 'object' or dtype.name == 'category':
categorical_columns.append(i)
# If X_train is not a DataFrame but a numpy array, you'll need to specify the indices manually
# For example, if you know 'MORTGAGE' is in column 3:
# categorical_columns = [3] # Add all categorical column indices here
# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
transformers=[
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
],
remainder='passthrough' # Keep other columns as they are
)
# Create the model pipeline
model = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier())
])
# Train the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
Accuracy: 0.9330980512505754
Classification Report:
precision recall f1-score support
0 0.93 0.99 0.96 5095
1 0.97 0.72 0.82 1422
accuracy 0.93 6517
macro avg 0.95 0.86 0.89 6517
weighted avg 0.94 0.93 0.93 6517
Confusion Matrix:
[[5059 36]
[ 400 1022]]
Improvements¶
Try Tree-Based Models¶
In [57]:
# First, install the xgboost package
!pip install xgboost
# Then import the required libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Assuming X_train, X_test, y_train, y_test are already defined
# First, identify categorical columns (this is an example - adjust based on your actual data)
# Let's assume 'MORTGAGE' is in a column named 'home_ownership'
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Create preprocessor
preprocessor = ColumnTransformer(
transformers=[
('num', 'passthrough', numerical_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
# Random Forest with preprocessing
rf_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])
rf_pipeline.fit(X_train, y_train)
rf_pred = rf_pipeline.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("\nRandom Forest Report:\n", classification_report(y_test, rf_pred))
# XGBoost with preprocessing
xgb_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])
xgb_pipeline.fit(X_train, y_train)
xgb_pred = xgb_pipeline.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))
print("\nXGBoost Report:\n", classification_report(y_test, xgb_pred))
Requirement already satisfied: xgboost in c:\user\p\lib\site-packages (3.0.5)
Requirement already satisfied: numpy in c:\user\p\lib\site-packages (from xgboost) (2.1.3)
Requirement already satisfied: scipy in c:\user\p\lib\site-packages (from xgboost) (1.15.3)
Random Forest Accuracy: 0.9340187202700629
Random Forest Report:
precision recall f1-score support
0 0.93 0.99 0.96 5095
1 0.97 0.72 0.83 1422
accuracy 0.93 6517
macro avg 0.95 0.86 0.89 6517
weighted avg 0.94 0.93 0.93 6517
XGBoost Accuracy: 0.9350928341261316
XGBoost Report:
precision recall f1-score support
0 0.93 0.99 0.96 5095
1 0.96 0.73 0.83 1422
accuracy 0.94 6517
macro avg 0.94 0.86 0.90 6517
weighted avg 0.94 0.94 0.93 6517
C:\User\p\Lib\site-packages\xgboost\training.py:183: UserWarning: [13:17:03] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:738:
Parameters: { "use_label_encoder" } are not used.
bst.update(dtrain, iteration=i, fobj=obj)
In [105]:
#Handle Class Imbalance
In [ ]:
In [59]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer # Import SimpleImputer for handling missing values
# Assuming X_train and X_test are pandas DataFrames
# First, identify categorical columns (those with string values like 'MORTGAGE')
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X_train.select_dtypes(exclude=['object']).columns.tolist()
# Create a preprocessor that will handle both numerical and categorical features
# Add imputation steps to handle missing values
preprocessor = ColumnTransformer(
transformers=[
# For numerical columns: first impute missing values with the mean
('num', Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('passthrough', 'passthrough')
]), numerical_columns),
# For categorical columns: first impute missing values with the most frequent value, then one-hot encode
('cat', Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
]), categorical_columns)
])
# Create a pipeline that first preprocesses the data, then applies the model
model_balanced = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', LogisticRegression(max_iter=1000, class_weight="balanced"))
])
# Fit the pipeline to the training data
model_balanced.fit(X_train, y_train)
# Make predictions
y_pred_bal = model_balanced.predict(X_test)
# Evaluate the model
print("Balanced Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_bal))
print("\nReport:\n", classification_report(y_test, y_pred_bal))
Balanced Logistic Regression Accuracy: 0.7759705385913764
Report:
precision recall f1-score support
0 0.92 0.78 0.85 5095
1 0.49 0.76 0.60 1422
accuracy 0.78 6517
macro avg 0.71 0.77 0.72 6517
weighted avg 0.83 0.78 0.79 6517
C:\User\p\Lib\site-packages\sklearn\linear_model\_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Feature Importance
Random Forest / XGBoost feature importances:
In [61]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
# Identify categorical & numeric columns
cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(exclude=['object']).columns
# Preprocessor
preprocessor = ColumnTransformer(
transformers=[
('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
('num', 'passthrough', num_cols)
]
)
# Pipeline
rf_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('model', RandomForestClassifier(random_state=42))
])
# Fit model
rf_pipeline.fit(X_train, y_train)
# Get feature importances
importances = rf_pipeline.named_steps['model'].feature_importances_
feature_names = rf_pipeline.named_steps['preprocessor'].get_feature_names_out()
feat_importances = pd.Series(importances, index=feature_names)
print(feat_importances.sort_values(ascending=False).head(10))
num__loan_percent_income 0.227442 num__person_income 0.139640 num__loan_int_rate 0.108983 num__loan_amnt 0.069097 num__person_emp_length 0.062406 cat__loan_grade_D 0.059890 cat__person_home_ownership_RENT 0.053760 num__person_age 0.043827 num__cb_person_cred_hist_length 0.034204 cat__person_home_ownership_MORTGAGE 0.027569 dtype: float64
In [62]:
#Tune Hyperparameters (GridSearchCV / RandomizedSearchCV)
In [63]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
xgb_pipeline = Pipeline(steps=[
('preprocessor', preprocessor), # your earlier ColumnTransformer
('model', XGBClassifier(
eval_metric='logloss',
use_label_encoder=False,
random_state=42
))
])
param_grid = {
'model__max_depth': [3, 5, 7],
'model__learning_rate': [0.01, 0.1, 0.2],
'model__n_estimators': [100, 200, 500],
'model__subsample': [0.8, 1.0],
'model__colsample_bytree': [0.8, 1.0]
}
grid = GridSearchCV(
xgb_pipeline,
param_grid,
scoring='recall',
cv=3,
verbose=2,
n_jobs=-1
)
grid.fit(X_train, y_train)
print("Best Parameters:", grid.best_params_)
print("Best Recall Score:", grid.best_score_)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
C:\User\p\Lib\site-packages\xgboost\training.py:183: UserWarning: [13:19:10] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:738:
Parameters: { "use_label_encoder" } are not used.
bst.update(dtrain, iteration=i, fobj=obj)
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 500, 'model__subsample': 1.0}
Best Recall Score: 0.7525504789048091
In [ ]:
Evaluate on the Test Set¶
Now that you have tuned hyperparameters, retrain the model on the full training data and evaluate on the test data:
In [64]:
best_xgb = grid.best_estimator_
y_pred = best_xgb.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
Test Accuracy: 0.9361669479822003
Classification Report:
precision recall f1-score support
0 0.93 0.99 0.96 5095
1 0.94 0.75 0.84 1422
accuracy 0.94 6517
macro avg 0.94 0.87 0.90 6517
weighted avg 0.94 0.94 0.93 6517
Confusion Matrix:
[[5029 66]
[ 350 1072]]
In [65]:
##Feature Importance (XGBoost specific)
In [66]:
importances = best_xgb.named_steps['model'].feature_importances_
feature_names = best_xgb.named_steps['preprocessor'].get_feature_names_out()
feat_imp = pd.Series(importances, index=feature_names)
print(feat_imp.sort_values(ascending=False).head(15))
cat__loan_grade_D 0.137872 cat__person_home_ownership_RENT 0.110663 cat__person_home_ownership_OWN 0.097535 cat__loan_grade_E 0.066564 cat__loan_grade_C 0.061964 cat__loan_grade_G 0.053514 cat__loan_grade_A 0.052265 cat__loan_intent_DEBTCONSOLIDATION 0.048179 num__loan_percent_income 0.046322 cat__loan_grade_F 0.042818 cat__loan_intent_MEDICAL 0.038055 cat__loan_intent_HOMEIMPROVEMENT 0.033725 cat__loan_intent_VENTURE 0.033176 cat__loan_grade_B 0.024915 num__person_income 0.020623 dtype: float32
In [ ]:
In [67]:
import pandas as pd
# New applicant details
new_applicant = pd.DataFrame([{
"person_age": 30,
"person_income": 45000,
"person_home_ownership": "RENT",
"person_emp_length": 24, # months
"loan_intent": "PERSONAL",
"loan_grade": "C",
"loan_amnt": 8000,
"loan_int_rate": 12.5,
"loan_percent_income": 0.18,
"cb_person_default_on_file": "N",
"cb_person_cred_hist_length": 5
}])
In [68]:
#Use your trained pipeline to predict
In [70]:
xgb_pipeline.fit(X_train, y_train)
C:\User\p\Lib\site-packages\xgboost\training.py:183: UserWarning: [13:31:34] WARNING: C:\actions-runner\_work\xgboost\xgboost\src\learner.cc:738:
Parameters: { "use_label_encoder" } are not used.
bst.update(dtrain, iteration=i, fobj=obj)
Out[70]:
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('cat',
OneHotEncoder(handle_unknown='ignore'),
Index(['person_home_ownership', 'loan_intent', 'loan_grade',
'cb_person_default_on_file'],
dtype='object')),
('num', 'passthrough',
Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
'loan_int_rate', 'loan_percent_income', 'cb_perso...
feature_types=None, feature_weights=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=None, n_jobs=None,
num_parallel_tree=None, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('cat',
OneHotEncoder(handle_unknown='ignore'),
Index(['person_home_ownership', 'loan_intent', 'loan_grade',
'cb_person_default_on_file'],
dtype='object')),
('num', 'passthrough',
Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
'loan_int_rate', 'loan_percent_income', 'cb_perso...
feature_types=None, feature_weights=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=None, n_jobs=None,
num_parallel_tree=None, ...))])ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
Index(['person_home_ownership', 'loan_intent', 'loan_grade',
'cb_person_default_on_file'],
dtype='object')),
('num', 'passthrough',
Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
dtype='object'))])Index(['person_home_ownership', 'loan_intent', 'loan_grade',
'cb_person_default_on_file'],
dtype='object')OneHotEncoder(handle_unknown='ignore')
Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
dtype='object')passthrough
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='logloss',
feature_types=None, feature_weights=None, gamma=None,
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, ...)In [71]:
# Predict class (0 = good loan, 1 = default)
prediction = xgb_pipeline.predict(new_applicant)
# Predict probability
probability = xgb_pipeline.predict_proba(new_applicant)
print("Prediction:", prediction[0])
print("Default Probability:", probability[0][1])
Prediction: 0 Default Probability: 0.043883912
Trying multiple applicant¶
In [72]:
new_applicants = pd.DataFrame([
{
"person_age": 30,
"person_income": 55000,
"person_home_ownership": "RENT",
"person_emp_length": 24,
"loan_intent": "EDUCATION",
"loan_grade": "B",
"loan_amnt": 10000,
"loan_int_rate": 12.5,
"loan_percent_income": 0.18,
"cb_person_default_on_file": "N",
"cb_person_cred_hist_length": 5
},
{
"person_age": 45,
"person_income": 30000,
"person_home_ownership": "MORTGAGE",
"person_emp_length": 60,
"loan_intent": "MEDICAL",
"loan_grade": "D",
"loan_amnt": 25000,
"loan_int_rate": 20.5,
"loan_percent_income": 0.5,
"cb_person_default_on_file": "Y",
"cb_person_cred_hist_length": 10
}
])
preds = xgb_pipeline.predict(new_applicants)
probs = xgb_pipeline.predict_proba(new_applicants)
results = pd.DataFrame({
"Prediction": preds,
"Default_Probability": probs[:, 1]
})
print(results)
Prediction Default_Probability 0 0 0.043685 1 1 0.960115
In [73]:
import joblib
joblib.dump(xgb_pipeline, "xgb_credit_model.pkl")
Out[73]:
['xgb_credit_model.pkl']
In [75]:
import pandas as pd
# Example: multiple new applicants
new_applicants = pd.DataFrame([
{
"person_age": 30,
"person_income": 55000,
"person_home_ownership": "RENT",
"person_emp_length": 24,
"loan_intent": "EDUCATION",
"loan_grade": "B",
"loan_amnt": 10000,
"loan_int_rate": 12.5,
"loan_percent_income": 0.18,
"cb_person_default_on_file": "N",
"cb_person_cred_hist_length": 5
},
{
"person_age": 45,
"person_income": 30000,
"person_home_ownership": "MORTGAGE",
"person_emp_length": 60,
"loan_intent": "MEDICAL",
"loan_grade": "D",
"loan_amnt": 25000,
"loan_int_rate": 20.5,
"loan_percent_income": 0.5,
"cb_person_default_on_file": "Y",
"cb_person_cred_hist_length": 10
},
{
"person_age": 22,
"person_income": 45000,
"person_home_ownership": "OWN",
"person_emp_length": 12,
"loan_intent": "PERSONAL",
"loan_grade": "A",
"loan_amnt": 5000,
"loan_int_rate": 10.5,
"loan_percent_income": 0.11,
"cb_person_default_on_file": "N",
"cb_person_cred_hist_length": 3
}
])
# Get predictions (0 = good loan, 1 = default)
preds = xgb_pipeline.predict(new_applicants)
# Get probabilities
probs = xgb_pipeline.predict_proba(new_applicants)
# Combine results into a DataFrame
results = new_applicants.copy()
results["Prediction"] = preds
results["Default_Probability"] = probs[:, 1]
print(results)
person_age person_income person_home_ownership person_emp_length \ 0 30 55000 RENT 24 1 45 30000 MORTGAGE 60 2 22 45000 OWN 12 loan_intent loan_grade loan_amnt loan_int_rate loan_percent_income \ 0 EDUCATION B 10000 12.5 0.18 1 MEDICAL D 25000 20.5 0.50 2 PERSONAL A 5000 10.5 0.11 cb_person_default_on_file cb_person_cred_hist_length Prediction \ 0 N 5 0 1 Y 10 1 2 N 3 0 Default_Probability 0 0.043685 1 0.960115 2 0.000190
In [76]:
import matplotlib.pyplot as plt
import numpy as np
# Plot default probabilities
plt.figure(figsize=(8,5))
applicants = np.arange(len(results)) # index for applicants
plt.bar(applicants, results["Default_Probability"], color="orange")
# Add labels
plt.xticks(applicants, [f"Applicant {i+1}" for i in applicants])
plt.ylabel("Default Probability")
plt.title("Predicted Default Risk for Applicants")
# Add values on top of bars
for i, prob in enumerate(results["Default_Probability"]):
plt.text(i, prob + 0.01, f"{prob:.2f}", ha="center", fontsize=10)
plt.ylim(0,1) # probability scale (0 to 1)
plt.show()
In [77]:
import joblib
# Save the trained pipeline
joblib.dump(xgb_pipeline, "credit_risk_model.pkl")
print("Model saved as credit_risk_model.pkl")
Model saved as credit_risk_model.pkl
In [79]:
def predict_applicant(applicant_data):
pred = xgb_pipeline.predict(applicant_data)[0]
prob = xgb_pipeline.predict_proba(applicant_data)[0][1]
result = "Default" if pred == 1 else "Good Loan"
return {"Prediction": result, "Default Probability": round(prob, 4)}
In [80]:
# Example applicant (must match feature order/columns of your dataset)
new_applicant = df.drop("loan_status", axis=1).iloc[[0]]
print(predict_applicant(new_applicant))
{'Prediction': 'Default', 'Default Probability': np.float32(0.9983)}
In [81]:
#Batch Prediction Function
In [82]:
import pandas as pd
def predict_batch(applicants_df):
"""
Takes a DataFrame of applicants (same structure as training data, without loan_status).
Returns predictions + default probabilities.
"""
preds = xgb_pipeline.predict(applicants_df)
probs = xgb_pipeline.predict_proba(applicants_df)[:, 1] # probability of default
results = applicants_df.copy()
results["Prediction"] = preds
results["Default_Probability"] = probs.round(4)
results["Prediction_Label"] = results["Prediction"].map({0: "Good Loan", 1: "Default"})
return results
In [83]:
# Example batch of applicants
new_applicants = pd.DataFrame([
{
"person_age": 30,
"person_income": 55000,
"person_home_ownership": "RENT",
"person_emp_length": 24,
"loan_intent": "EDUCATION",
"loan_grade": "B",
"loan_amnt": 10000,
"loan_int_rate": 12.5,
"loan_percent_income": 0.18,
"cb_person_default_on_file": "N",
"cb_person_cred_hist_length": 5
},
{
"person_age": 45,
"person_income": 30000,
"person_home_ownership": "MORTGAGE",
"person_emp_length": 60,
"loan_intent": "MEDICAL",
"loan_grade": "D",
"loan_amnt": 25000,
"loan_int_rate": 20.5,
"loan_percent_income": 0.50,
"cb_person_default_on_file": "Y",
"cb_person_cred_hist_length": 10
}
])
results = predict_batch(new_applicants)
print(results)
person_age person_income person_home_ownership person_emp_length \ 0 30 55000 RENT 24 1 45 30000 MORTGAGE 60 loan_intent loan_grade loan_amnt loan_int_rate loan_percent_income \ 0 EDUCATION B 10000 12.5 0.18 1 MEDICAL D 25000 20.5 0.50 cb_person_default_on_file cb_person_cred_hist_length Prediction \ 0 N 5 0 1 Y 10 1 Default_Probability Prediction_Label 0 0.0437 Good Loan 1 0.9601 Default
In [84]:
#Generate synthetic applicants
In [85]:
import numpy as np
import pandas as pd
np.random.seed(42) # reproducibility
# Define possible categories from your dataset
home_ownership = ["RENT", "MORTGAGE", "OWN", "OTHER"]
loan_intents = ["EDUCATION", "MEDICAL", "VENTURE", "PERSONAL", "DEBTCONSOLIDATION", "HOMEIMPROVEMENT"]
loan_grades = ["A", "B", "C", "D", "E", "F", "G"]
default_file = ["Y", "N"]
# Generate 5000 applicants
new_5000 = pd.DataFrame({
"person_age": np.random.randint(18, 70, 5000),
"person_income": np.random.randint(20000, 120000, 5000),
"person_home_ownership": np.random.choice(home_ownership, 5000),
"person_emp_length": np.random.randint(0, 240, 5000), # months
"loan_intent": np.random.choice(loan_intents, 5000),
"loan_grade": np.random.choice(loan_grades, 5000),
"loan_amnt": np.random.randint(1000, 40000, 5000),
"loan_int_rate": np.round(np.random.uniform(5, 30, 5000), 2), # interest rates
"loan_percent_income": np.round(np.random.uniform(0.05, 0.6, 5000), 2),
"cb_person_default_on_file": np.random.choice(default_file, 5000),
"cb_person_cred_hist_length": np.random.randint(1, 30, 5000)
})
print("✅ Synthetic dataset of 5000 applicants generated")
new_5000.head()
✅ Synthetic dataset of 5000 applicants generated
Out[85]:
| person_age | person_income | person_home_ownership | person_emp_length | loan_intent | loan_grade | loan_amnt | loan_int_rate | loan_percent_income | cb_person_default_on_file | cb_person_cred_hist_length | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | 21920 | OTHER | 113 | PERSONAL | F | 29194 | 27.20 | 0.37 | N | 13 |
| 1 | 69 | 97219 | OTHER | 132 | HOMEIMPROVEMENT | D | 34785 | 24.23 | 0.55 | Y | 29 |
| 2 | 46 | 96872 | MORTGAGE | 230 | MEDICAL | C | 38458 | 11.91 | 0.38 | N | 27 |
| 3 | 32 | 101132 | OWN | 74 | EDUCATION | F | 9373 | 11.04 | 0.18 | N | 7 |
| 4 | 60 | 22093 | RENT | 58 | EDUCATION | E | 31283 | 25.96 | 0.23 | N | 21 |
In [86]:
results_5000 = predict_batch(new_5000)
print(results_5000.head())
print("✅ Total applicants scored:", len(results_5000))
person_age person_income person_home_ownership person_emp_length \
0 56 21920 OTHER 113
1 69 97219 OTHER 132
2 46 96872 MORTGAGE 230
3 32 101132 OWN 74
4 60 22093 RENT 58
loan_intent loan_grade loan_amnt loan_int_rate loan_percent_income \
0 PERSONAL F 29194 27.20 0.37
1 HOMEIMPROVEMENT D 34785 24.23 0.55
2 MEDICAL C 38458 11.91 0.38
3 EDUCATION F 9373 11.04 0.18
4 EDUCATION E 31283 25.96 0.23
cb_person_default_on_file cb_person_cred_hist_length Prediction \
0 N 13 1
1 Y 29 1
2 N 27 0
3 N 7 0
4 N 21 1
Default_Probability Prediction_Label
0 0.9850 Default
1 0.9144 Default
2 0.2375 Good Loan
3 0.0004 Good Loan
4 0.9815 Default
✅ Total applicants scored: 5000
In [87]:
summary = results_5000["Prediction_Label"].value_counts(normalize=True) * 100
print("📊 Portfolio Risk Distribution (%):\n", summary)
📊 Portfolio Risk Distribution (%): Prediction_Label Default 55.64 Good Loan 44.36 Name: proportion, dtype: float64
In [88]:
results_5000.to_csv("synthetic_credit_risk_5000.csv", index=False)
print("✅ Results saved to synthetic_credit_risk_5000.csv")
✅ Results saved to synthetic_credit_risk_5000.csv
In [ ]:
In [89]:
#Risk Distribution (Good vs Default)
In [90]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(6,4))
sns.countplot(x="Prediction_Label", data=results_5000, palette="Set2")
plt.title("Portfolio Risk Distribution (5000 Applicants)")
plt.xlabel("Loan Outcome")
plt.ylabel("Number of Applicants")
plt.show()
C:\Users\sangr\AppData\Local\Temp\ipykernel_18364\1746783086.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(x="Prediction_Label", data=results_5000, palette="Set2")
In [91]:
#Default Probability Histogram
In [92]:
plt.figure(figsize=(8,5))
sns.histplot(results_5000["Default_Probability"], bins=20, kde=True, color="orange")
plt.title("Distribution of Default Probabilities")
plt.xlabel("Default Probability")
plt.ylabel("Number of Applicants")
plt.show()
In [93]:
#Age vs Default Risk
In [94]:
plt.figure(figsize=(8,5))
sns.boxplot(x="Prediction_Label", y="person_age", data=results_5000, palette="coolwarm")
plt.title("Age Distribution by Loan Outcome")
plt.xlabel("Loan Outcome")
plt.ylabel("Age")
plt.show()
C:\Users\sangr\AppData\Local\Temp\ipykernel_18364\832474758.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x="Prediction_Label", y="person_age", data=results_5000, palette="coolwarm")
In [95]:
#Income vs Default Risk
In [96]:
plt.figure(figsize=(8,5))
sns.boxplot(x="Prediction_Label", y="person_income", data=results_5000, palette="viridis")
plt.title("Income Distribution by Loan Outcome")
plt.xlabel("Loan Outcome")
plt.ylabel("Income")
plt.ylim(0, 150000)
plt.show()
C:\Users\sangr\AppData\Local\Temp\ipykernel_18364\86574630.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x="Prediction_Label", y="person_income", data=results_5000, palette="viridis")
In [97]:
#oan Amount vs Default Risk
In [98]:
plt.figure(figsize=(8,5))
sns.boxplot(x="Prediction_Label", y="loan_amnt", data=results_5000, palette="mako")
plt.title("Loan Amount Distribution by Loan Outcome")
plt.xlabel("Loan Outcome")
plt.ylabel("Loan Amount")
plt.show()
C:\Users\sangr\AppData\Local\Temp\ipykernel_18364\277379129.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x="Prediction_Label", y="loan_amnt", data=results_5000, palette="mako")
In [99]:
pip install plotly
Requirement already satisfied: plotly in c:\user\p\lib\site-packages (5.24.1) Requirement already satisfied: tenacity>=6.2.0 in c:\user\p\lib\site-packages (from plotly) (9.0.0) Requirement already satisfied: packaging in c:\user\p\lib\site-packages (from plotly) (24.2) Note: you may need to restart the kernel to use updated packages.
In [100]:
import plotly.express as px
fig = px.histogram(
results_5000,
x="Prediction_Label",
color="Prediction_Label",
title="Portfolio Risk Distribution (5000 Applicants)",
text_auto=True
)
fig.show()
In [101]:
fig = px.histogram(
results_5000,
x="Default_Probability",
nbins=30,
title="Distribution of Default Probabilities",
color="Prediction_Label"
)
fig.show()
In [102]:
fig = px.bar(
results_5000.groupby("loan_intent")["Prediction"].mean().reset_index(),
x="loan_intent",
y="Prediction",
title="Default Rate by Loan Intent",
color="loan_intent"
)
fig.show()
In [103]:
fig = px.scatter(
results_5000,
x="person_income",
y="Default_Probability",
color="Prediction_Label",
size="loan_amnt",
hover_data=["loan_intent", "loan_grade"],
title="Income vs Default Probability (Loan Size = Bubble)"
)
fig.show()
In [104]:
fig = px.box(
results_5000,
x="Prediction_Label",
y="person_age",
color="Prediction_Label",
title="Age Distribution by Loan Outcome"
)
fig.show()
In [ ]: